import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In this article, we compare a number of classification methods for the breast cancer dataset. The details regarding this dataset can be found in Diagnostic Wisconsin Breast Cancer Database [1]. We will use the following classification methods and then compare them in terms of performance.
# Load the breast-cancer data set into a DataFrame with title-cased column names
# and a human-readable 'Diagnosis' column, then display it and save it to CSV.
data = load_breast_cancer()
df = pd.DataFrame(data['data'], columns = [x.title() for x in data['feature_names']])
Labels = [x.title() for x in data['target_names'].tolist()]
# Target code i names the class target_names[i], so code 0 must map to Labels[0]
# and code 1 to Labels[1] (the previous lambda swapped the two classes).
df['Diagnosis'] = pd.Series(data['target'], index = df.index).map(dict(enumerate(Labels)))
display(df)
# Make sure the output folder exists so that to_csv does not fail on a fresh checkout.
import os
os.makedirs('Data', exist_ok = True)
df.to_csv('Data/bcw_dataset.csv')
| Mean Radius | Mean Texture | Mean Perimeter | Mean Area | Mean Smoothness | Mean Compactness | Mean Concavity | Mean Concave Points | Mean Symmetry | Mean Fractal Dimension | ... | Worst Texture | Worst Perimeter | Worst Area | Worst Smoothness | Worst Compactness | Worst Concavity | Worst Concave Points | Worst Symmetry | Worst Fractal Dimension | Diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | Benign |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | Benign |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | Benign |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | Benign |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | Benign |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | ... | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | Benign |
| 565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | ... | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | Benign |
| 566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | ... | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | Benign |
| 567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | ... | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | Benign |
| 568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | ... | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | Malignant |
569 rows × 31 columns
As can be seen, the number of instances is 569 and the number of attributes is 31 (30 numeric features plus the Diagnosis label). The object of the exercise is to create a classification model that can classify the type of Diagnosis based on the rest of the attributes. However, first, let's plot a count plot for the Diagnosis attribute.
# Feature matrix and target vector as provided by the loaded data set.
X = data.data
y = data.target
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance.
# Standardise the features and compare per-feature variance before and after.
feature_names = [x.title() for x in data['feature_names']]
X = pd.DataFrame(data = X, columns = feature_names)
# scaling data
# NOTE(review): the scaler is fitted on every row before the train/test split is made,
# so statistics of the future test rows influence the standardised training features.
scaler = preprocessing.StandardScaler()
X_std = pd.DataFrame(data = scaler.fit_transform(X), columns = feature_names)
del scaler
fig, ax = plt.subplots(2, 1, figsize=(20, 8))
ax = ax.ravel()
font = FontProperties()
font.set_weight('bold')
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", 20)]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=20, shrink= .3)
# One heatmap row per data set: raw features on top, standardised features below.
for axis, frame, title, palette in zip(ax, Sets, Names, CP):
    Temp = frame.var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
    # .iloc[0] reads the single row maximum by position; plain [0] on a label-indexed
    # Series relies on a deprecated positional fallback.
    _ = sns.heatmap(Temp, ax=axis, annot=True, square=True, cmap = palette,
                    linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1).iloc[0], annot_kws={"size": 8},
                    cbar_kws=kws)
    _ = axis.set_yticklabels('')
    _ = axis.set_title(title, fontproperties=font, fontsize = 16)
del Temp, axis, frame, title, palette
# From here on X holds the standardised features.
X = X_std.copy()
del CP, Names, ax, fig, font, Sets, kws, X_std, feature_names
def Dist_Table(Inp, Target):
    """Tabulate how often each value of a column occurs.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame containing the column to summarise.
    Target : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, ordered by descending count, with the columns
        ``Target``, ``'Count'`` and ``'Percentage'`` (share of all rows, rounded to
        two decimals).
    """
    counts = Inp[Target].value_counts()
    # rename_axis + reset_index(name=...) yields the same two columns regardless of how
    # the installed pandas names the value_counts result.
    Table = counts.rename_axis(Target).reset_index(name = 'Count')
    # Numeric class codes 0/1 are translated with the module-level Labels list;
    # values that are already strings pass through unchanged.
    Table[Target] = Table[Target].replace(dict(zip([0,1],Labels)))
    # Share of each class; the previous "100 - share" reported the complement instead.
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    return Table
def Dist_Plot(Table, Target, PieColors = ['FireBrick','SeaGreen'], TableColors = ['Navy','White']):
    """Show a distribution table next to a donut chart of the same counts.

    Parameters
    ----------
    Table : pandas.DataFrame
        Frame with the columns ``Target``, ``'Count'`` and ``'Percentage'``.
    Target : str
        Name of the class column; also used for the legend and the figure title.
    PieColors : list of str
        Slice colours, in the row order of ``Table``.
    TableColors : list of str
        Header fill colour followed by the cell fill colour.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right: donut chart
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values, pull=[0, 0.1],
                         textfont=dict(size=16),
                         marker=dict(colors = PieColors, line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=.5)
    fig.update_layout(height = 400, legend=dict(orientation="v"), legend_title_text= Target)
    # Left: the same numbers as a table, with percentages shown to two decimals
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
    cell_values = [T.loc[:, column].values for column in T.columns]
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25),
                           columnwidth = [0.4, 0.2, 0.2],
                           cells=dict(values=cell_values, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    # Separate the column name from the word "Distribution" and close the bold tag.
    fig.update_layout(title={'text': '<b>' + Target + ' Distribution' + '</b>', 'x':0.5,
                             'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Summarise the Diagnosis column and draw its table / donut chart.
Table = Dist_Table(df, 'Diagnosis')
Dist_Plot(Table, 'Diagnosis')
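StratifiedShuffleSplit, which is used below, is a variation of ShuffleSplit that returns stratified randomized splits; like StratifiedKFold (the stratified variation of k-fold), each set contains approximately the same percentage of samples of each target class as the complete set.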
# Single stratified 70/30 split of the data into train and test sets.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
# n_splits is 1, so the generator yields exactly one (train, test) pair of index arrays.
train_index, test_index = next(sss.split(X, y))
# The splitter returns row positions, so pandas objects are sliced with .iloc
# (label-based .loc / [] only works while the index happens to be 0..n-1).
# X
if isinstance(X, pd.DataFrame):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
else:
    X_train, X_test = X[train_index], X[test_index]
# y
if isinstance(y, pd.Series):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
else:
    y_train, y_test = y[train_index], y[test_index]
del sss
def TrainTestSplitPlot(y_train, y_test, Target):
    """Draw two donut charts with the class balance of the train and the test set.

    Parameters
    ----------
    y_train, y_test : array-like or pandas.Series
        Class codes of the two sets; code ``i`` is labelled with ``Labels[i]``.
    Target : str
        Text used for the legend title and the figure title.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    Colors = ['FireBrick','SeaGreen']
    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}]*2])
    class_codes = range(len(Labels))
    for col, (name, codes) in enumerate((('Train Set', y_train), ('Test Set', y_test)), start=1):
        # value_counts() orders by frequency; reindexing by class code lines the counts
        # up with Labels so that every slice carries the label of its own class.
        counts = pd.Series(np.asarray(codes)).value_counts().reindex(class_codes, fill_value=0)
        fig.add_trace(go.Pie(labels=Labels,
                             values=counts.values,
                             pull=[0, 0.1],
                             name= name,
                             textfont=dict(size=16),
                             marker= dict(colors = Colors, line=dict(color='black', width=1))), 1, col)
    fig.update_traces(hole=.5)
    fig.update_layout(height = 400, legend=dict(orientation="v"),
                      legend_title_text= Target,
                      annotations=[dict(text= '<b>' + 'Train<br>Set' + '</b>', x=0.195, y=0.5,
                                        font_size=14, showarrow=False),
                                   dict(text= '<b>' + 'Test<br>Set' + '</b>', x=0.8, y=0.5,
                                        font_size=14, showarrow=False)],
                      title={'text': '<b>' + Target + '</b>', 'x':0.48, 'y': .83,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Plot the class balance of both sets, then show the shape of every split array.
TrainTestSplitPlot(y_train, y_test, 'Diagnosis')
split_shapes = pd.DataFrame(data={'Set': ['X_train', 'X_test', 'y_train', 'y_test'],
                                  'Shape': [X_train.shape, X_test.shape, y_train.shape, y_test.shape]})
split_shapes = split_shapes.set_index('Set')
display(split_shapes.T)
del split_shapes
| Set | X_train | X_test | y_train | y_test |
|---|---|---|---|---|
| Shape | (398, 30) | (171, 30) | (398,) | (171,) |
A random forest classifier (RFC) fits several decision tree classifiers on various sub-samples of the dataset and then averages them to improve the predictive accuracy. See sklearn.ensemble.RandomForestClassifier for more details.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print a coloured banner: ``Text`` on a ``C`` background followed by a rule of '='.

    Parameters
    ----------
    Text : str
        Banner text.
    L : int
        Total width; the rule is ``L - len(Text) - 1`` characters long.
    C : str
        Background colour of the text and foreground colour of the rule
        (one of Black, Red, Green, Yellow, Blue, Magenta, Cyan).
    T : str
        Foreground colour of the text (the colours above, or White).
    """
    background_names = ['Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan']
    BACK = {name: getattr(Back, name.upper()) for name in background_names}
    FORE = {name: getattr(Fore, name.upper()) for name in background_names + ['White']}
    banner = BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = FORE[C] + Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL
    print(banner + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a rule of ``L`` '=' characters in the foreground colour named ``C``.

    ``C`` is one of Black, Red, Green, Yellow, Blue, Magenta, Cyan or White.
    """
    colour_names = ('Black', 'Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan', 'White')
    FORE = {name: getattr(Fore, name.upper()) for name in colour_names}
    rule = L*'='
    print(FORE[C] + Style.NORMAL + rule + Style.RESET_ALL)
def Search_List(Key, List):
    """Return the items of ``List`` that contain ``Key``, in their original order."""
    matches = []
    for item in List:
        if Key in item:
            matches.append(item)
    return matches
def Best_Parm(model, param_dist, Top = None, X = X, y = y, n_splits = 20, scoring = 'precision', H = 600, titleY = .95):
    """Run a randomized hyper-parameter search and report the best candidates.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    param_dist : dict
        Parameter names mapped to the values / distributions to sample from.
    Top : int or None
        Number of best-ranked candidates to show; ``None`` shows all of them.
    X, y : array-like
        Data used for the search. The defaults are the module-level ``X`` and ``y``
        as they were when this function was defined.
    n_splits : int
        Number of stratified shuffle splits used for cross-validation.
    scoring : str
        Scoring name passed to ``RandomizedSearchCV``.
    H, titleY : number
        Height and title position forwarded to ``Grid_Performance_Plot``.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=Test_Size, random_state=42)
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = cv,
                              n_iter = int(1e3), scoring = scoring, error_score = 0, verbose = 0,
                              n_jobs = 10, return_train_score = True)
    _ = grid.fit(X, y)
    Table = Grid_Table(grid)
    # Identity test for None instead of ==
    if Top is None:
        Top = Table.shape[0]
    Table = Table.iloc[:Top,:]
    # Table: merge each mean/std pair into a single "mean ± std" text column
    T = Table.copy()
    fmt = lambda x: ('%.2e' % x)
    for label in ['Train Score', 'Test Score', 'Fit Time']:
        T[label] = T['Mean ' + label].map(fmt)+ ' ± ' +T['STD ' + label].map(fmt)
    T = T.drop(columns = ['Mean Train Score','STD Train Score','Mean Test Score','STD Test Score',
                          'Mean Fit Time','STD Fit Time'])
    styler = T.head(Top).style
    # Styler.hide_index() no longer exists in pandas 2.x; use Styler.hide() when it is available.
    styler = styler.hide(axis = 'index') if hasattr(styler, 'hide') else styler.hide_index()
    display(styler.background_gradient(subset= ['Rank Test Score'],
                                       cmap=sns.diverging_palette(145, 300, s=60, as_cmap=True)).\
            set_properties(subset=['Params'], **{'background-color': 'Indigo', 'color': 'White'}).\
            set_properties(subset=['Train Score'], **{'background-color': 'HoneyDew', 'color': 'Black'}).\
            set_properties(subset=['Test Score'], **{'background-color': 'Azure', 'color': 'Black'}).\
            set_properties(subset=['Fit Time'], **{'background-color': 'Linen', 'color': 'Black'}))
    # Plot
    Grid_Performance_Plot(Table, n_splits = n_splits, H = H, titleY = titleY)
    return grid
def Grid_Table(grid):
    """Collect the key columns of ``grid.cv_results_`` into a frame sorted by test rank.

    Parameters
    ----------
    grid : object
        Fitted search object exposing a ``cv_results_`` mapping.

    Returns
    -------
    pandas.DataFrame
        Columns: rank, parameters as text (braces and single quotes removed), and the
        mean / standard deviation of train score, test score and fit time.
    """
    results = grid.cv_results_
    # Delete '{', '}' and "'" from the printed parameter dictionaries.
    strip_chars = str.maketrans('', '', "{}'")
    columns = {'Rank Test Score': results['rank_test_score'],
               'Params': [str(params).translate(strip_chars) for params in results['params']]}
    # Train, test and fit-time statistics, in that order.
    for label, key in [('Train Score', 'train_score'),
                       ('Test Score', 'test_score'),
                       ('Fit Time', 'fit_time')]:
        columns['Mean ' + label] = results['mean_' + key]
        columns['STD ' + label] = results['std_' + key]
    frame = pd.DataFrame(columns)
    frame = frame.sort_values('Rank Test Score')
    return frame.reset_index(drop = True)
def Grid_Performance_Plot(Table, n_splits, H = 550, titleY =.95):
    """Plot mean train and test scores (with ±std error bars) for every parameter set.

    Parameters
    ----------
    Table : pandas.DataFrame
        Frame with the columns 'Params', 'Mean/STD Train Score' and 'Mean/STD Test Score'.
    n_splits : int
        Number of splits, shown in the figure title.
    H : number
        Figure height.
    titleY : float
        Vertical position of the title.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    train_mean, train_std = Table['Mean Train Score'], Table['STD Train Score']
    test_mean, test_std = Table['Mean Test Score'], Table['STD Test Score']
    # Shared y-range: lowest (mean - std) and highest (mean + std) over both panels,
    # widened by the same scaling the limits have always used.
    lows = np.append(train_mean - train_std, test_mean - test_std)
    y_low = np.floor((lows*100- lows)).min()/100
    highs = np.append(train_mean + train_std, test_mean + test_std)
    y_high = np.ceil((highs*100 + highs)).max()/100

    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        subplot_titles=('<b>Train Set<b>', '<b>Test Set<b>'))
    panels = [(train_mean, train_std, 'SeaGreen'), (test_mean, test_std, 'RoyalBlue')]
    for col, (means, stds, colour) in enumerate(panels, start=1):
        fig.add_trace(go.Scatter(x= Table['Params'], y= means, showlegend=False, marker_color= colour,
                                 error_y=dict(type='data', array=stds, visible=True)), 1, col)

    frame_style = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                       zerolinewidth=1, zerolinecolor='Black', gridwidth=1, gridcolor='Lightgray')
    fig.update_xaxes(zeroline=False, showgrid=False, **frame_style)
    fig.update_yaxes(zeroline=True, showgrid=True, range= [y_low, y_high], **frame_style)
    fig.update_yaxes(title_text="Mean Score", row=1, col=1)
    title_text = '<b>' + ('RandomizedSearchCV with %i-fold cross validation' % n_splits) + '<b>'
    fig.update_layout(plot_bgcolor= 'white', width = 980, height = H,
                      title={'text': title_text, 'x':0.5, 'y':titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def _summarise_cv(reports, matrices, index, columns):
    """Average per-split report arrays and confusion matrices into display tables."""
    # Stacking along a new leading axis keeps the aggregation valid for a single split too.
    stacked = np.stack(reports)
    mean = stacked.mean(axis = 0)
    std = stacked.std(axis = 0)
    cells = [['%.4f' % m + ' ± ' + '%.4f' % s for m, s in zip(mean_row, std_row)]
             for mean_row, std_row in zip(mean, std)]
    report = pd.DataFrame(cells, index = index, columns = columns)
    confusion = np.stack(matrices).mean(axis = 0).round(0).astype(int)
    return report, confusion


def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10):
    """Score ``model`` over repeated stratified shuffle splits.

    For every split the model is refitted in place on the training part; a classification
    report and a confusion matrix are computed for both the training and the test part.

    Parameters
    ----------
    model : estimator
        Object with ``fit`` and ``predict``; it is left fitted on the last split.
    X, y : array-like, pandas.DataFrame or pandas.Series
        Features and class codes. The defaults are the module-level ``X`` and ``y``
        as they were when this function was defined.
    n_splits : int
        Number of shuffle splits (test fraction ``Test_Size``, ``random_state=42``).

    Returns
    -------
    Reports_Train, Reports_Test : pandas.DataFrame
        Classification reports as "mean ± std" text; the first column is named
        ``'Train Set (CV = % i)' % n_splits`` / ``'Test Set (CV = % i)' % n_splits``.
    CM_Train, CM_Test : numpy.ndarray
        Confusion matrices averaged over the splits and rounded to integers.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)

    reports = {'Train': [], 'Test': []}
    matrices = {'Train': [], 'Test': []}
    R = None  # last report frame; supplies the row / column labels of the summaries
    for train_index, test_index in sss.split(X, y):
        _ = model.fit(X[train_index], y[train_index])
        for name, rows in (('Train', train_index), ('Test', test_index)):
            y_true = y[rows]
            y_pred = model.predict(X[rows])
            R = pd.DataFrame(metrics.classification_report(y_true, y_pred, target_names=Labels,
                                                           output_dict=True)).T
            reports[name].append(R.values)
            matrices[name].append(metrics.confusion_matrix(y_true, y_pred))

    Reports_Train, CM_Train = _summarise_cv(reports['Train'], matrices['Train'], R.index, R.columns)
    Reports_Test, CM_Test = _summarise_cv(reports['Test'], matrices['Test'], R.index, R.columns)
    # The "% i" format (space flag) is kept: callers look the column up with the same format.
    Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
    Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, n_splits = 10):
    """Draw raw and row-normalised confusion-matrix heatmaps for the train and test sets.

    Parameters
    ----------
    CM_Train, CM_Test : numpy.ndarray
        Confusion matrices of the two sets.
    n_splits : int
        Number of CV splits, shown in the figure titles.

    One figure with two heatmaps is created per set; nothing is returned.
    """
    # Font
    bold = FontProperties()
    bold.set_weight('bold')
    panels = [('Train Set (CV = % i)' % n_splits, CM_Train, 'Greens', 'YlGn'),
              ('Test Set (CV = % i)' % n_splits, CM_Test, 'Blues', 'PuBu')]
    for title, matrix, cmap_counts, cmap_rates in panels:
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        ax_counts, ax_rates = axes[0], axes[1]
        fig.suptitle(title, fontproperties=bold, fontsize = 16)
        # Left: absolute counts
        sns.heatmap(matrix, annot=True, annot_kws={"size": 14}, cmap=cmap_counts, ax = ax_counts,
                    linewidths = 0.2, cbar_kws={"shrink": 1})
        ax_counts.set_title('Confusion Matrix')
        # Right: every row divided by its total
        row_totals = matrix.sum(axis=1)[:, np.newaxis]
        sns.heatmap(matrix.astype('float') / row_totals,
                    annot=True, annot_kws={"size": 14}, cmap=cmap_rates, ax = ax_rates,
                    linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
        ax_rates.set_title('Normalized Confusion Matrix')
        for axis in axes:
            axis.set_xlabel('Predicted labels')
            axis.set_ylabel('True labels')
            axis.xaxis.set_ticklabels(Labels)
            axis.yaxis.set_ticklabels(Labels)
            axis.set_aspect(1)
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}},\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align} The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# Evaluate a random forest with default hyper-parameters by repeated stratified splits.
def _hide_index(styler):
    """Hide the index of a Styler on pandas versions with or without Styler.hide_index."""
    return styler.hide(axis = 'index') if hasattr(styler, 'hide') else styler.hide_index()

Header('Random Forest Classifier with Default Parameters')
n_splits = 20
RFC= RandomForestClassifier()
print('Default Parameters = %s' % RFC.get_params(deep=True))
# No separate fit on (X_train, y_train): Stratified_CV_Scoring refits RFC on every split,
# which would overwrite that fit anyway.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(RFC, X = X, y = y, n_splits = n_splits)
display(_hide_index(Reports_Train.style).set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(_hide_index(Reports_Test.style).set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'RoyalBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, n_splits = n_splits)
# Same summary metrics for both sets; the class with code 1 is treated as the positive class.
for set_name, cm, colour in (('Train', CM_Train, 'Green'), ('Test', CM_Test, 'Blue')):
    Header(set_name + ' Set', C = colour)
    tn, fp, fn, tp = cm.ravel()
    Precision = tp/(tp+fp)
    Recall = tp/(tp + fn)
    TPR = tp/(tp +fn)
    TNR = tn/(tn +fp)
    BA = (TPR + TNR)/2
    print('Precision (%s) = %.2f' % (set_name, Precision))
    print('Recall (%s) = %.2f' % (set_name, Recall))
    print('TPR (%s) = %.2f' % (set_name, TPR))
    print('TNR (%s) = %.2f' % (set_name, TNR))
    print('Balanced Accuracy (%s) = %.2f' % (set_name, BA))
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA, set_name, cm, colour
Line()
Random Forest Classifier with Default Parameters =================================================== Default Parameters = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Malignant | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 148.0000 ± 0.0000 |
| Benign | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 250.0000 ± 0.0000 |
| accuracy | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 |
| macro avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 398.0000 ± 0.0000 |
| weighted avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 398.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Malignant | 0.9555 ± 0.0193 | 0.9422 ± 0.0262 | 0.9484 ± 0.0126 | 64.0000 ± 0.0000 |
| Benign | 0.9660 ± 0.0149 | 0.9734 ± 0.0119 | 0.9695 ± 0.0072 | 107.0000 ± 0.0000 |
| accuracy | 0.9617 ± 0.0091 | 0.9617 ± 0.0091 | 0.9617 ± 0.0091 | 0.9617 ± 0.0091 |
| macro avg | 0.9607 ± 0.0096 | 0.9578 ± 0.0116 | 0.9590 ± 0.0099 | 171.0000 ± 0.0000 |
| weighted avg | 0.9621 ± 0.0093 | 0.9617 ± 0.0091 | 0.9616 ± 0.0092 | 171.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 1.00 Recall (Train) = 1.00 TPR (Train) = 1.00 TNR (Train) = 1.00 Balanced Accuracy (Train) = 1.00 Test Set =========================================================================================== Precision (Test) = 0.96 Recall (Test) = 0.97 TPR (Test) = 0.97 TNR (Test) = 0.94 Balanced Accuracy (Test) = 0.95 ====================================================================================================
In order to find the parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# Randomized search over forest size, tree depth and minimum leaf size.
RFC = RandomForestClassifier()
param_dist = {'n_estimators': [50, 100, 200],
              'max_depth': [None, 3, 5, 7, 10],
              'min_samples_leaf': [1, 0.1, 0.01, 0.001]}
Header('Random Forest Classifier with the Best Parameters')
grid = Best_Parm(RFC, param_dist, Top = 20, H = 850, titleY = .96)
Random Forest Classifier with the Best Parameters ==================================================
| Rank Test Score | Params | Train Score | Test Score | Fit Time |
|---|---|---|---|---|
| 1 | n_estimators: 200, min_samples_leaf: 1, max_depth: 10 | 1.00e+00 ± 0.00e+00 | 9.67e-01 ± 1.59e-02 | 2.67e-01 ± 3.11e-02 |
| 2 | n_estimators: 50, min_samples_leaf: 0.001, max_depth: None | 1.00e+00 ± 0.00e+00 | 9.66e-01 ± 1.45e-02 | 6.48e-02 ± 5.39e-03 |
| 3 | n_estimators: 100, min_samples_leaf: 1, max_depth: 10 | 1.00e+00 ± 0.00e+00 | 9.66e-01 ± 1.43e-02 | 1.33e-01 ± 1.12e-02 |
| 4 | n_estimators: 200, min_samples_leaf: 0.001, max_depth: None | 1.00e+00 ± 0.00e+00 | 9.66e-01 ± 1.57e-02 | 2.53e-01 ± 9.03e-03 |
| 5 | n_estimators: 100, min_samples_leaf: 0.001, max_depth: 7 | 9.99e-01 ± 1.83e-03 | 9.66e-01 ± 1.48e-02 | 1.30e-01 ± 6.55e-03 |
| 6 | n_estimators: 100, min_samples_leaf: 0.001, max_depth: 10 | 1.00e+00 ± 0.00e+00 | 9.66e-01 ± 1.63e-02 | 1.25e-01 ± 4.98e-03 |
| 7 | n_estimators: 50, min_samples_leaf: 1, max_depth: 10 | 1.00e+00 ± 0.00e+00 | 9.65e-01 ± 1.49e-02 | 6.41e-02 ± 2.76e-03 |
| 8 | n_estimators: 200, min_samples_leaf: 0.001, max_depth: 7 | 9.99e-01 ± 1.59e-03 | 9.65e-01 ± 1.65e-02 | 2.53e-01 ± 9.73e-03 |
| 9 | n_estimators: 100, min_samples_leaf: 1, max_depth: None | 1.00e+00 ± 0.00e+00 | 9.65e-01 ± 1.46e-02 | 1.37e-01 ± 8.98e-03 |
| 10 | n_estimators: 50, min_samples_leaf: 0.001, max_depth: 7 | 9.99e-01 ± 2.21e-03 | 9.65e-01 ± 1.82e-02 | 6.58e-02 ± 5.44e-03 |
| 11 | n_estimators: 200, min_samples_leaf: 1, max_depth: None | 1.00e+00 ± 0.00e+00 | 9.65e-01 ± 1.65e-02 | 2.54e-01 ± 9.25e-03 |
| 12 | n_estimators: 100, min_samples_leaf: 1, max_depth: 7 | 1.00e+00 ± 1.20e-03 | 9.65e-01 ± 1.29e-02 | 1.24e-01 ± 3.67e-03 |
| 13 | n_estimators: 200, min_samples_leaf: 0.01, max_depth: 10 | 9.85e-01 ± 4.31e-03 | 9.65e-01 ± 1.65e-02 | 2.46e-01 ± 8.83e-03 |
| 14 | n_estimators: 100, min_samples_leaf: 0.001, max_depth: None | 1.00e+00 ± 0.00e+00 | 9.65e-01 ± 1.26e-02 | 1.30e-01 ± 1.05e-02 |
| 15 | n_estimators: 100, min_samples_leaf: 1, max_depth: 5 | 9.93e-01 ± 2.95e-03 | 9.64e-01 ± 1.63e-02 | 1.23e-01 ± 5.51e-03 |
| 16 | n_estimators: 200, min_samples_leaf: 0.001, max_depth: 10 | 1.00e+00 ± 0.00e+00 | 9.64e-01 ± 1.63e-02 | 2.50e-01 ± 7.93e-03 |
| 17 | n_estimators: 50, min_samples_leaf: 1, max_depth: None | 1.00e+00 ± 8.68e-04 | 9.64e-01 ± 1.52e-02 | 6.72e-02 ± 2.91e-03 |
| 18 | n_estimators: 200, min_samples_leaf: 1, max_depth: 7 | 9.99e-01 ± 1.42e-03 | 9.64e-01 ± 1.57e-02 | 2.60e-01 ± 1.05e-02 |
| 19 | n_estimators: 50, min_samples_leaf: 1, max_depth: 7 | 9.99e-01 ± 1.83e-03 | 9.64e-01 ± 1.38e-02 | 6.52e-02 ± 3.43e-03 |
| 20 | n_estimators: 50, min_samples_leaf: 1, max_depth: 5 | 9.93e-01 ± 3.71e-03 | 9.64e-01 ± 1.33e-02 | 6.48e-02 ± 3.98e-03 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# Evaluate a random forest built from the best parameters found by the search.
def _hide_index(styler):
    """Hide the index of a Styler on pandas versions with or without Styler.hide_index."""
    return styler.hide(axis = 'index') if hasattr(styler, 'hide') else styler.hide_index()

Header('Random Forest Classifier with the Best Parameters')
RFC = RandomForestClassifier(**grid.best_params_)
# These are the tuned settings, so they are no longer labelled as "Default Parameters".
print('Best Parameters = %s' % RFC.get_params(deep=True))
# No separate fit on (X_train, y_train): Stratified_CV_Scoring refits RFC on every split.
# n_splits is used throughout so that the report column names looked up below always
# match the ones produced by the scoring call.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(RFC, X = X, y = y, n_splits = n_splits)
display(_hide_index(Reports_Train.style).set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(_hide_index(Reports_Test.style).set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, n_splits = n_splits)
# Same summary metrics for both sets; the class with code 1 is treated as the positive class.
for set_name, cm, colour in (('Train', CM_Train, 'Green'), ('Test', CM_Test, 'Blue')):
    Header(set_name + ' Set', C = colour)
    tn, fp, fn, tp = cm.ravel()
    Precision = tp/(tp+fp)
    Recall = tp/(tp + fn)
    TPR = tp/(tp +fn)
    TNR = tn/(tn +fp)
    BA = (TPR + TNR)/2
    print('Precision (%s) = %.2f' % (set_name, Precision))
    print('Recall (%s) = %.2f' % (set_name, Recall))
    print('TPR (%s) = %.2f' % (set_name, TPR))
    print('TNR (%s) = %.2f' % (set_name, TNR))
    print('Balanced Accuracy (%s) = %.2f' % (set_name, BA))
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA, set_name, cm, colour
Line()
Random Forest Classifier with the Best Parameters ================================================== Default Parameters = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Malignant | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 148.0000 ± 0.0000 |
| Benign | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 250.0000 ± 0.0000 |
| accuracy | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 |
| macro avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 398.0000 ± 0.0000 |
| weighted avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 398.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Malignant | 0.9568 ± 0.0192 | 0.9375 ± 0.0271 | 0.9466 ± 0.0124 | 64.0000 ± 0.0000 |
| Benign | 0.9634 ± 0.0151 | 0.9743 ± 0.0121 | 0.9687 ± 0.0069 | 107.0000 ± 0.0000 |
| accuracy | 0.9605 ± 0.0088 | 0.9605 ± 0.0088 | 0.9605 ± 0.0088 | 0.9605 ± 0.0088 |
| macro avg | 0.9601 ± 0.0090 | 0.9559 ± 0.0115 | 0.9577 ± 0.0096 | 171.0000 ± 0.0000 |
| weighted avg | 0.9609 ± 0.0088 | 0.9605 ± 0.0088 | 0.9604 ± 0.0089 | 171.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 1.00 Recall (Train) = 1.00 TPR (Train) = 1.00 TNR (Train) = 1.00 Balanced Accuracy (Train) = 1.00 Test Set =========================================================================================== Precision (Test) = 0.96 Recall (Test) = 0.97 TPR (Test) = 0.97 TNR (Test) = 0.94 Balanced Accuracy (Test) = 0.95 ====================================================================================================